Load .RData

load(file = "gsea_output.RData")

# Add variable containing number of genes in leading edge subset
score_df$nLeadingEdge <- as.numeric(lapply(score_df$leadingEdge, length))

Load .RData file from gsea_source.R analysis.

Distribution of GSEA output

# Enrichment score distribution
hist(score_df$ES, breaks = 50)

# Normalized Enrichment score distribution
hist(score_df$NES, breaks = 50)

# Nominal P Value distribution
hist(score_df$pval, breaks = 50)

# FDR q value distribution
hist(score_df$padj, breaks = 50)

# Number of more extreme gene sets distribution
hist(score_df$nMoreExtreme, breaks = 50)

# Size of gene set distribution
hist(score_df$size, breaks = 50)

# Size of leading edge subset distribution
hist(score_df$nLeadingEdge, breaks = 50)

We created histograms of each of the numeric variables from the GSEA output. This demonstrated the bimodal distributions for ES and NES as well as the changes in distributions between the nominal p-values and fdr values.

Pairs of variables

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Filter data set using fdr cutoff
cutoff <- 1
filter_scores <- score_df %>% 
  filter(padj <= cutoff)
summary(filter_scores[,c(2:7,9)])
##       pval               padj               ES               NES         
##  Min.   :0.000999   Min.   :0.01413   Min.   :-0.9699   Min.   :-2.3518  
##  1st Qu.:0.056920   1st Qu.:0.22727   1st Qu.:-0.3132   1st Qu.:-0.9509  
##  Median :0.335093   Median :0.66828   Median : 0.2795   Median : 0.7641  
##  Mean   :0.388967   Mean   :0.57319   Mean   : 0.1297   Mean   : 0.3322  
##  3rd Qu.:0.688758   3rd Qu.:0.91816   3rd Qu.: 0.4920   3rd Qu.: 1.3075  
##  Max.   :1.000000   Max.   :1.00000   Max.   : 0.9620   Max.   : 2.8764  
##   nMoreExtreme        size          nLeadingEdge   
##  Min.   :  0.0   Min.   :   5.00   Min.   :  1.00  
##  1st Qu.: 23.0   1st Qu.:  10.00   1st Qu.:  4.00  
##  Median :147.0   Median :  19.00   Median :  7.00  
##  Mean   :202.8   Mean   :  45.01   Mean   : 13.93  
##  3rd Qu.:324.2   3rd Qu.:  41.00   3rd Qu.: 13.00  
##  Max.   :919.0   Max.   :1639.00   Max.   :362.00
# Size of gene set vs size of leading edge subset
size_map <- ggplot(score_df, 
                   mapping = aes(size, nLeadingEdge, alpha = 0.01, label = pathway)) +
  geom_point() +
  ggtitle("Gene set size vs. size of leading edge subset")

plotly::ggplotly(size_map)
# Normalized vs Observed Enrichment Score
ggplot(score_df) +
  geom_histogram(aes(ES, alpha = 0.05, fill = "ES"), 
                 bins = 50) +
  geom_histogram(aes(NES, alpha = 0.05, fill = "NES"), 
                 bins = 50) +
  ggtitle("Distribution of ES(S) and NES(S) values") +
  scale_fill_manual(name = element_blank(), 
                    values = c("ES" = "red", "NES" = "blue"),
                    labels = c("ES(S)", "NES(S)")) +
  theme(axis.title.x = element_blank())

We see a fairly strong positive correlation between gene set size and the size of the leading edge subset for that gene set, as was expected. The majority of observations have a gene set size under 100 and leading edge subset under 50.

The second plot shows the ES histogram overlayed with the NES histogram. Both, as noted previously, exhibit a bimodal distribution, but we see that the NES distribution has a noticeably wider spread to the data, making extreme values much more apparent.

FDR and gene set Size

# Plot fdr and gene set size together
fdr_size_plot <- 
  ggplot(
    data = score_df, 
    mapping = aes(padj, size, alpha = 0.05)
    ) +
  geom_point(colour = "darkblue") +
  ggtitle("False Discovery Rate and Gene Set Size")

plotly::ggplotly(fdr_size_plot)
# FDR distribution for gene sets over 100 
large_sets <- score_df %>%
  filter(size >= 100)
hist(large_sets$padj)

plot(large_sets$padj, large_sets$size)

cor.test(large_sets$padj, large_sets$size)
## 
##  Pearson's product-moment correlation
## 
## data:  large_sets$padj and large_sets$size
## t = 0.037464, df = 141, p-value = 0.9702
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.161077  0.167217
## sample estimates:
##         cor 
## 0.003155039

There does not seem to be any significant relationship between a gene set’s size and its false discovery rate. Filtering to include only the gene sets with size over 100, it seems that there are significantly more observations with fdr values close to zero; however, the two variables are not linearly correlated even after filtering the data.

Enrichment plots

# Create enrichment plot for selected gene set/pathway
set <- "REACTOME_RHOA_GTPASE_CYCLE"

enrichment_plot <- fgsea::plotEnrichment(
  pathway = pathways[[set]], 
  stats = mageck_lfc_sort,                    
  gseaParam = 1
  )
enrichment_plot

enrichment_plot2 <- fgsea::plotEnrichment(
  pathway = pathways[["WP_ENDOCHONDRAL_OSSIFICATION"]], 
  stats = mageck_lfc_sort,                    
  gseaParam = 1
  )
enrichment_plot2

enrichment_plot3 <- fgsea::plotEnrichment(
  pathway = pathways[["REACTOME_HOMOLOGY_DIRECTED_REPAIR"]], 
  stats = mageck_lfc_sort,                    
  gseaParam = 1
  )
enrichment_plot3

enrichment_plot4 <- fgsea::plotEnrichment(
  pathway = pathways[["REACTOME_G0_AND_EARLY_G1"]], 
  stats = mageck_lfc_sort,                    
  gseaParam = 1
  )
enrichment_plot4

Viewing pathways with extreme values

# View pathways with lowest fdr values
gsea_by_fdr <- score_df[order(score_df$padj, decreasing = FALSE),]
head(gsea_by_fdr)
##                                                                                        pathway
## 1:                                                                        BIOCARTA_MCM_PATHWAY
## 2:                                                                    BIOCARTA_ATRBRCA_PATHWAY
## 3: REACTOME_TRANSLESION_SYNTHESIS_BY_Y_FAMILY_DNA_POLYMERASES_BYPASSES_LESIONS_ON_DNA_TEMPLATE
## 4:                   REACTOME_RECOGNITION_OF_DNA_DAMAGE_BY_PCNA_CONTAINING_REPLICATION_COMPLEX
## 5:                                                      REACTOME_TRANSLESION_SYNTHESIS_BY_POLH
## 6:       REACTOME_RESOLUTION_OF_AP_SITES_VIA_THE_MULTIPLE_NUCLEOTIDE_PATCH_REPLACEMENT_PATHWAY
##           pval       padj        ES      NES nMoreExtreme size
## 1: 0.001543210 0.01413043 0.8742624 2.084541            0   13
## 2: 0.001490313 0.01413043 0.7606050 1.992289            0   19
## 3: 0.001388889 0.01413043 0.7902764 2.312725            0   33
## 4: 0.001416431 0.01413043 0.8090642 2.248785            0   26
## 5: 0.001562500 0.01413043 0.8793194 2.199676            0   16
## 6: 0.001466276 0.01413043 0.8198164 2.175772            0   21
##                                leadingEdge nLeadingEdge
## 1:       orc6,orc5,orc2,orc4,mcm7,mcm4,...           11
## 2: chek1,rad9a,rad51,mre11a,rad1,rad50,...           11
## 3:       pole,rpa3,rfc5,vcp,pcna,pold2,...           20
## 4:     pole,rpa3,rfc5,pcna,wdr48,pold2,...           20
## 5:      rpa3,rfc5,vcp,pcna,rfc2,rps27a,...           12
## 6:      pole,rpa3,rfc5,pcna,pold2,rfc2,...           18
# View extreme pathways by ES and NES
gsea_by_ES <- score_df[order(score_df$ES, decreasing = TRUE),]
head(gsea_by_ES[,c("pathway", "ES", "NES", "padj")])
##                                                                          pathway
## 1:                         REACTOME_CDC6_ASSOCIATION_WITH_THE_ORC_ORIGIN_COMPLEX
## 2:             REACTOME_ASSEMBLY_OF_THE_ORC_COMPLEX_AT_THE_ORIGIN_OF_REPLICATION
## 3:                                              REACTOME_PHOSPHORYLATION_OF_EMI1
## 4: REACTOME_SLBP_DEPENDENT_PROCESSING_OF_REPLICATION_DEPENDENT_HISTONE_PRE_MRNAS
## 5:                   REACTOME_MISMATCH_REPAIR_MMR_DIRECTED_BY_MSH2_MSH3_MUTSBETA
## 6:                                                REACTOME_DNA_STRAND_ELONGATION
##           ES      NES       padj
## 1: 0.9619646 2.026453 0.01484389
## 2: 0.9539566 2.069101 0.01462986
## 3: 0.9291420 1.798997 0.02738644
## 4: 0.9267651 2.060102 0.01446076
## 5: 0.9169753 1.931680 0.01484389
## 6: 0.9045440 2.400637 0.01413043
tail(gsea_by_ES[,c("pathway", "ES", "NES", "padj")])
##                                                                      pathway
## 1: REACTOME_ASSOCIATION_OF_TRIC_CCT_WITH_TARGET_PROTEINS_DURING_BIOSYNTHESIS
## 2:                                                   BIOCARTA_IL22BP_PATHWAY
## 3:                                                     BIOCARTA_IFNG_PATHWAY
## 4:                                                    BIOCARTA_STAT3_PATHWAY
## 5:                                                     BIOCARTA_TERT_PATHWAY
## 6:                                                     BIOCARTA_IFNA_PATHWAY
##            ES       NES       padj
## 1: -0.7548374 -1.861285 0.02067350
## 2: -0.7684930 -1.894957 0.02067350
## 3: -0.7987253 -1.726270 0.10005863
## 4: -0.8271995 -2.005774 0.02034230
## 5: -0.8367602 -1.934245 0.02019148
## 6: -0.9699129 -2.351821 0.02034230
gsea_by_NES <- score_df[order(score_df$NES, decreasing = TRUE),]
head(gsea_by_NES[,c("pathway", "ES", "NES", "padj")])
##                                                                   pathway
## 1:                                             REACTOME_METABOLISM_OF_RNA
## 2: REACTOME_MAJOR_PATHWAY_OF_RRNA_PROCESSING_IN_THE_NUCLEOLUS_AND_CYTOSOL
## 3:                                      REACTOME_HOMOLOGY_DIRECTED_REPAIR
## 4:                                   REACTOME_NONSENSE_MEDIATED_DECAY_NMD
## 5:                                                 REACTOME_MRNA_SPLICING
## 6:               REACTOME_PROCESSING_OF_CAPPED_INTRON_CONTAINING_PRE_MRNA
##           ES      NES       padj
## 1: 0.7028721 2.876448 0.01413043
## 2: 0.7638078 2.857947 0.01413043
## 3: 0.7856302 2.787720 0.01413043
## 4: 0.7736164 2.770933 0.01413043
## 5: 0.7307581 2.766267 0.01413043
## 6: 0.7087467 2.761222 0.01413043
tail(gsea_by_NES[,c("pathway", "ES", "NES", "padj")])
##                                                        pathway         ES
## 1:                                       BIOCARTA_TERT_PATHWAY -0.8367602
## 2: WP_DYSREGULATED_MIRNA_TARGETING_IN_INSULINPI3KAKT_SIGNALING -0.5890128
## 3:                                WP_ENDOCHONDRAL_OSSIFICATION -0.5013328
## 4:                                      BIOCARTA_STAT3_PATHWAY -0.8271995
## 5:                  REACTOME_CD28_DEPENDENT_PI3K_AKT_SIGNALING -0.7274447
## 6:                                       BIOCARTA_IFNA_PATHWAY -0.9699129
##          NES       padj
## 1: -1.934245 0.02019148
## 2: -1.970958 0.02602603
## 3: -1.978433 0.03381908
## 4: -2.005774 0.02034230
## 5: -2.321345 0.02429907
## 6: -2.351821 0.02034230

Code for gene look-up

# 
# # Look-up by gene code
# gene <- "musk"
# if (length(mageck[mageck$id ==gene,]) != 0){
#   print(mageck[mageck$id==gene,])
# } else {
#   print("No results found")
# }